## "Where are the missing dead" Appendix replication
##
## Reproduce: Prefecture-Year & Province-Month analysis
##
## 			  - Figure 7(a) & 7(b)
##			  - Appendix Table A7 & A8
##				- Appendix Figure A4
##
## 
## Input: - event_data.dta (event data)

## Guoer Liu
## 8/02/2022 


setwd("~/Dropbox/Missing dead/replication/")


library(haven)
library(scales)
library(estimatr)
library(data.table)
library(modelsummary) ## modelsummary(): for lm_robust object
library(extrafont) ## change fonts in plot
#font_import()
#loadfonts(device="all")       #Register fonts 
#fonts()

#install.packages('devtools', repos = 'http://cran.us.r-project.org') 
#devtools::install_github('xuyiqing/panelView') 
library(panelView)



### Function 


collapseEvent <- function(vector, unit, time, 
						  log = TRUE,
						  sum = FALSE,
						  count = FALSE){
    # Args:
    #   vector:  n x 1 vector to be collapsed by unit-time
    #   unit:    n x 1 vector of event unit
    #   time:    n x 1 vector of event time
    if (sum == TRUE){
    	out <- tapply(vector, list(unit, time), sum)
    		if (log == TRUE){
    		out <- log(out)
    		}
    	out <- cbind(rownames(out), out)
    	out <- as.data.table(out)
    	names(out)[1] <- "unit"
    	out <- melt(out, id = 1, na.rm = FALSE, 
    				variable.name = "time", 
    				value.name = "death")
    	out$unit <- as.numeric(out$unit)
    	out$death <- as.numeric(out$death)
    	out$prov_code <- as.numeric(substr(out$unit, 1, 2))
    	out$time <- as.numeric(as.character(out$time))
    }

    if (count == TRUE){
    	out <- tapply(vector, list(unit, time), length)
    		if (log == TRUE){
    		out <- log(out)
    		}
    	out <- cbind(rownames(out), out)
    	out <- as.data.table(out)
    	names(out)[1] <- "unit"
    	out <- melt(out, id = 1, na.rm = FALSE, 
    				variable.name = "time", 
    				value.name = "count")
    	out$unit <- as.numeric(out$pref_code)
    	out$count <- as.numeric(out$count)
    	out$prov_code <- as.numeric(substr(out$unit, 1, 2))
    	out$time <- as.numeric(as.character(out$time))

    }

 
    return(out)
}

dataCollapse <- function(vector, year.v, treat.ind){
    # Args:
    #   vector:     n x 1 vector to be collapsed by province-treat
    #   year.v:     n x 1 vector
    #   treat.ind:  n x 1 vector of treatment indicator
    out <- tapply(vector, 
                  list(year.v, treat.ind), 
                  function(x) mean(x, na.rm = T))
    return(out)
}


paraPlot <- function(collapMat, 
					matrix.x.lab = NULL, matrix.title = NULL,
					l.mar = 3.5,
					hline = NULL,
					x.axis.vec = NULL,
					y.lab = TRUE, y.lab.names = NULL,
					y.min = NULL, y.max = NULL,
					legend = TRUE, leg.pos = NULL){
	# Args:
	#	collapMat:      n x 2 matrix (column 1: control; column 2: treat)
	#   matrix.x.lab:   a string of figure x axis label
	#	matrix.title:	a string of figure title
	#	hline:			a scalar indicate the position of vertical line
	#	x.axis.vec:		a length n string vector that shows x axis
	#   y.lab:			logical parameter to turn off y label
	#   y.lab.names:    a string of y axis lable
	#   y.min:			a scalar indicates the minimum value on y axis
	#   y.max:			a scalar indicates the maximum value on y axis
	#   legend:			logical parameter to turn off legend
	#	leg.pos:		a string indicates legend position
	pch.cex <- 1.6
	ctl.v <- collapMat[, 1]
	tr.v <- collapMat[, 2]
	#y.max <- max(x)
	#y.min <- min(x)
	x.len <- nrow(collapMat)

	par(mar= c(3.2, l.mar, 1, 1))
	plot(NA, xlim = c(0.5, x.len), 
	         ylim = c(y.min, y.max), 
	         type = "n", 
	         xlab = matrix.x.lab,
	         main = matrix.title, 
	         ylab = "", axes = F, cex.lab = 1.3)
	grid(nx = NULL, ny = NULL,
     	 lty = 2,      # Grid line type
     	 col = "gray", # Grid line color
     	 lwd = 0.6)
	abline(v = hline, col = alpha("orangered", 0.8), lwd = 1.5)
	lines(ctl.v, lty = 2, lwd = 1.5)
	points(ctl.v, cex = pch.cex, pch = 1, col = "black")
	lines(tr.v, lty = 1, lwd = 1.5)
	points(tr.v, cex = pch.cex, pch = 19, col = "black", bg = "black")
	axis(1, at = seq(1, x.len, 1), label = x.axis.vec, 
		 cex.axis = 1, las = 1)
	if (y.lab == TRUE){
		axis(2, at = seq(ceiling(y.min), floor(y.max), 1), 
			label = seq(ceiling(y.min), floor(y.max), 1),
			cex.axis = 1, las = 1)
		title(ylab = y.lab.names, line = 2, cex.lab = 1)
	}
	if (legend == TRUE){
		legend(leg.pos, title = NULL, 
			   legend = c("Treated", "Control"), 
			   col = "black", 
			   lty = c(1, 2), cex = 0.8,
			   pch = c(19, 1),
			   lwd = c(1, 1))
	}
}


mydata <- read_dta("event_data.dta")
mydata <- as.data.frame(mydata)

mydata <- subset(mydata, year > 1999 & year < 2006)
mydata.traf <- mydata[mydata$category_manu == 5, ]
mydata.othr <- mydata[mydata$category_manu != 5, ]

treat_group_idx <- c("52", "36", "37", "53") ## by prov_code NOT province_CN 
									  ## same content, easier to subset data

## Prefecture-Year data

deathsum_pfy_lg <- collapseEvent(mydata$death, mydata$pref_code, mydata$year, 
				   			     sum = TRUE)
deathsum_pfy_lg$treat <- ifelse(deathsum_pfy_lg$prov_code %in% treat_group_idx,
							    1, 0)
deathsum_pfy_lg$time_ind <- ifelse(deathsum_pfy_lg$time >= 2004, 1, 0)
deathsum_pfy_lg$did <- deathsum_pfy_lg$treat * deathsum_pfy_lg$time_ind



trafsum_pfy_lg <- collapseEvent(mydata.traf$death, mydata.traf$pref_code, 
							  mydata.traf$year, sum = TRUE)
trafsum_pfy_lg$treat <- ifelse(trafsum_pfy_lg$prov_code %in% treat_group_idx,
							   1, 0)
trafsum_pfy_lg$time_ind <- ifelse(trafsum_pfy_lg$time >= 2004, 1, 0)
trafsum_pfy_lg$did <- trafsum_pfy_lg$treat * trafsum_pfy_lg$time_ind


othrsum_pfy_lg <- collapseEvent(mydata.othr$death, mydata.othr$pref_code, 
							    mydata.othr$year, sum = TRUE)
othrsum_pfy_lg$treat <- ifelse(othrsum_pfy_lg$prov_code %in% treat_group_idx,
							   1, 0)
othrsum_pfy_lg$time_ind <- ifelse(othrsum_pfy_lg$time >= 2004, 1, 0)
othrsum_pfy_lg$did <- othrsum_pfy_lg$treat * othrsum_pfy_lg$time_ind


pref.year <- list(all = deathsum_pfy_lg,
				  traffic = trafsum_pfy_lg,
				  others = othrsum_pfy_lg)


## Province-month data
deathsum_pm_lg <- collapseEvent(mydata$death, mydata$prov_code, mydata$month, 
				   			   sum = TRUE)
deathsum_pm_lg$treat <- ifelse(deathsum_pm_lg$prov_code %in% treat_group_idx,
							    1, 0)
deathsum_pm_lg$time_ind <- ifelse(deathsum_pm_lg$time >= 522, 1, 0)
deathsum_pm_lg$did <- deathsum_pm_lg$treat * deathsum_pm_lg$time_ind


trafsum_pm_lg <- collapseEvent(mydata.traf$death, mydata.traf$prov_code, 
							  mydata.traf$month, sum = TRUE)
trafsum_pm_lg$treat <- ifelse(trafsum_pm_lg$prov_code %in% treat_group_idx,1, 0)
trafsum_pm_lg$time_ind <- ifelse(trafsum_pm_lg$time >= 522, 1, 0)
trafsum_pm_lg$did <- trafsum_pm_lg$treat * trafsum_pm_lg$time_ind


othrsum_pm_lg <- collapseEvent(mydata.othr$death, mydata.othr$prov_code, 
							  mydata.othr$month, sum = TRUE)
othrsum_pm_lg$treat <- ifelse(othrsum_pm_lg$prov_code %in% treat_group_idx,1, 0)
othrsum_pm_lg$time_ind <- ifelse(othrsum_pm_lg$time >= 522, 1, 0)
othrsum_pm_lg$did <- othrsum_pm_lg$treat * othrsum_pm_lg$time_ind

prov.month <- list(all = deathsum_pm_lg,
				  traffic = trafsum_pm_lg,
				  others = othrsum_pm_lg)

t.var <- c("did", "time_ind", "treat")
formula <- as.formula(paste("death ~", paste(t.var, collapse = " + "), 
					        "+ factor(time) + factor(prov_code)"))

## Prefecture-Year

estimate.pfy <- NULL
SE.pfy <- NULL
mod.out.pfy <- list()

for(i in 1:length(pref.year)){
	mod <- lm_robust(formula, cluster = prov_code, data = pref.year[[i]])
	estimate.pfy <- c(estimate.pfy, coefficients(mod)['did'])
	SE.pfy <- c(SE.pfy, summary(mod)$coefficients['did', 2])
	mod.out.pfy <- append(mod.out.pfy, list(mod))
}



## Province-month

estimate.pm <- NULL
SE.pm <- NULL
mod.out.pm <- list()

for(i in 1:length(prov.month)){
	mod <- lm_robust(formula, cluster = prov_code, data = prov.month[[i]])
	estimate.pm <- c(estimate.pm, coefficients(mod)['did'])
	SE.pm <- c(SE.pm, summary(mod)$coefficients['did', 2])
	mod.out.pm <- append(mod.out.pm, list(mod))
}


## Plot Fig 7 (b): prefecture-year DID 

#pdf("f7_prefYear_did.pdf", family = "Times New Roman", 
#	width = 3.4, height = 3)
	y.ind <- c(1.2, 1.4, 1.6)
	y.ind <- rev(y.ind)

	par(mar= c(3,5,0,1), mgp=c(2, 0.5, 0))
	plot(NA, xlim = c(-0.7, 0.7), 
	         ylim = c(1, 1.8), 
	         type = "n", 
	         xlab = "DD Coefficient", 
	         ylab = "", axes = F, cex.lab = 1.1)
	segments(x0 = estimate.pfy - qnorm(0.95) * SE.pfy, 
	         y0 = y.ind, 
	         x1 = estimate.pfy + qnorm(0.95) * SE.pfy, 
	         y1 = y.ind, 
	         col = alpha("black", 0.45), lwd = 6)
	segments(x0 = estimate.pfy - qnorm(0.975) * SE.pfy, 
	         y0 = y.ind, 
	         x1 = estimate.pfy + qnorm(0.975) * SE.pfy, 
	         y1 = y.ind, 
	         col = "black", lwd = 2)
	points(estimate.pfy, y.ind, cex = 2, pch = 19, 
	       col = "black", bg = "black")
	axis(1, at = seq(-0.5, 0.5, 0.5), cex.axis = 1, las = 1)
	axis(2, at = y.ind, 
			labels = c("Aggregate", "Traffic", "Others"), 
			las = 2, cex.axis = 1, tick = F)
	abline(v = 0, col = "gray", lty = 2, lwd = 0.8)
	legend("bottomright", title = "CI", 
		   legend = c("95%", "90%"), 
		   col = c("black", alpha("black", 0.45)), 
		   lty = c(1, 1), cex = 0.8,
		   lwd = c(1, 4))
#dev.off()

## Plot Fig 9 (a): parallel trend check: prefecture-year DID 


deathsum_pfy_tr <- dataCollapse(pref.year[[1]]$death, 
								pref.year[[1]]$time, pref.year[[1]]$treat)
trafsum_pfy_tr <- dataCollapse(pref.year[[2]]$death, 
							   pref.year[[2]]$time, pref.year[[2]]$treat)
trafsum_pfy_tr <- rbind(rep(NA, 2), trafsum_pfy_tr)
othrsum_pfy_tr <- dataCollapse(pref.year[[3]]$death, 
							   pref.year[[3]]$time, pref.year[[3]]$treat)



#pdf("f7_paraCheck.pdf", family = "Times New Roman", width = 6, height = 2)
par(mfrow = c(1, 3), mgp=c(2.2, 0.8, 0))
paraPlot(deathsum_pfy_tr, "Time", "Aggregate",
		hline = 4, x.axis.vec = seq(2000, 2005, 1), 
		y.lab = TRUE, y.lab.names = "Log (Deaths)",
		y.min = 1.5, y.max = 3.5,
		legend = FALSE, leg.pos = "bottomright")
paraPlot(trafsum_pfy_tr, "Time", "Traffic",
		hline = 4, x.axis.vec = seq(2000, 2005, 1), 
		y.lab = FALSE, y.lab.names = "Log (Deaths)",
		y.min = 1.5, y.max = 3.5, l.mar = 2, 
		legend = FALSE, leg.pos = "bottomright")
paraPlot(othrsum_pfy_tr, "Time", "Others",
		hline = 4, x.axis.vec = seq(2000, 2005, 1), 
		y.lab = FALSE, y.lab.names = "Log (Deaths)",
		y.min = 1.5, y.max = 3.5, l.mar = 2,
		legend = FALSE, leg.pos = "bottomright")
#dev.off()

## 

gm <- list(
  list(raw = "r.squared", clean = "R2", fmt = 3),
  list(raw = "adj.r.squared", clean = "R2 Adj.", fmt = 3),
  list(raw = "se_type", clean = "Std.Errors", fmt = 3),
  list(raw = "nobs", clean = "Observations", fmt = 0)
)


names(mod.out.pfy) <- c("Death (all)", "Death (road)", "Death (others)")
names(mod.out.pm) <- c("Death (all)", "Death (road)", "Death (others)")


print(modelsummary(mod.out.pfy, 
	  coef_map = c("did", "time_ind", "treat"), 
	  gof_map = gm,
	  output = "table.tex",
	  stars = FALSE))

print(modelsummary(mod.out.pm, 
	  coef_map = c("did", "time_ind", "treat"), 
	  gof_map = gm,
	  output = "table.tex",
	  stars = FALSE))


## check missingness



summary(deathsum_pfy_lg$death)
sd(deathsum_pfy_lg$death, na.rm = T)
sd(mydata$death, na.rm = T)
summary(trafsum_pfy_lg$death)
sd(trafsum_pfy_lg$death, na.rm = T)
sd(mydata.traf$death, na.rm = T)

summary(othrsum_pfy_lg$death)
sd(othrsum_pfy_lg$death, na.rm = T)
sd(mydata.othr$death, na.rm = T)


summary(deathsum_pm_lg$death)
summary(trafsum_pm_lg$death)
summary(othrsum_pm_lg$death)

sum(is.na(deathsum_pm_lg$death)) / dim(deathsum_pm_lg)[1]
sum(is.na(trafsum_pm_lg$death)) / dim(trafsum_pm_lg)[1]
sum(is.na(othrsum_pm_lg$death)) / dim(othrsum_pm_lg)[1]


panelview(death ~ 1, 
          data = deathsum_pfy_lg, index = c("unit","time"), 
          xlab = "Year", ylab = "Prefecture")
panelview(death ~ 1, 
          data = trafsum_pfy_lg, index = c("unit","time"), 
          xlab = "Year", ylab = "Prefecture")
panelview(death ~ 1, 
          data = othrsum_pfy_lg, index = c("unit","time"), 
          xlab = "Year", ylab = "Prefecture")

length(unique(deathsum_pfy_lg$unit))
length(unique(deathsum_pfy_lg$unit))
length(unique(deathsum_pfy_lg$unit))



## Prefecture-Month data

deathsum_pfmon_lg <- collapseEvent(mydata$death, mydata$pref_code, 
																	 mydata$month, 
																	 sum = TRUE)
deathsum_pfmon_lg$treat <- ifelse(deathsum_pfmon_lg$prov_code %in% 
																	treat_group_idx, 1, 0)
deathsum_pfmon_lg$time_ind <- ifelse(deathsum_pfmon_lg$time >= 2004, 1, 0)
deathsum_pfmon_lg$did <- deathsum_pfmon_lg$treat * deathsum_pfmon_lg$time_ind


trafsum_pfmon_lg <- collapseEvent(mydata.traf$death, mydata.traf$pref_code, 
							  									mydata.traf$month, sum = TRUE)
trafsum_pfmon_lg$treat <- ifelse(trafsum_pfmon_lg$prov_code %in% 
																	treat_group_idx, 1, 0)
trafsum_pfmon_lg$time_ind <- ifelse(trafsum_pfmon_lg$time >= 2004, 1, 0)
trafsum_pfmon_lg$did <- trafsum_pfmon_lg$treat * trafsum_pfmon_lg$time_ind


othrsum_pfmon_lg <- collapseEvent(mydata.othr$death, mydata.othr$pref_code, 
							    mydata.othr$month, sum = TRUE)
othrsum_pfmon_lg$treat <- ifelse(othrsum_pfmon_lg$prov_code %in% 
																 treat_group_idx, 1, 0)
othrsum_pfmon_lg$time_ind <- ifelse(othrsum_pfmon_lg$time >= 2004, 1, 0)
othrsum_pfmon_lg$did <- othrsum_pfmon_lg$treat * othrsum_pfmon_lg$time_ind


pref.month <- list(all = deathsum_pfmon_lg,
				  traffic = trafsum_pfmon_lg,
				  others = othrsum_pfmon_lg)


## Check missing values

#pdf("apx_pref_month.pdf", family = "Times New Roman", width = 6, height = 6)
panelview(Y = "death", type = "missing",
          data = deathsum_pfmon_lg, index = c("unit","time"), 
          xlab = "Month", ylab = "Prefecture", 
          color = c(alpha("ivory2", 0.7), "mistyrose3"),
          axis.lab = "unit", background = "white")
#dev.off()
panelview(Y = "death", type = "missing",
          data = trafsum_pfmon_lg, index = c("unit","time"), 
          xlab = "Year", ylab = "Prefecture")
panelview(Y = "death", type = "missing",
          data = othrsum_pfmon_lg, index = c("unit","time"), 
          xlab = "Year", ylab = "Prefecture")

#packageVersion("panelview")

summary(deathsum_pfmon_lg$death)
sd(deathsum_pfmon_lg$death, na.rm = T)
sum(is.na(deathsum_pfmon_lg$death)) / dim(deathsum_pfmon_lg)[1]

summary(trafsum_pfmon_lg$death)
sd(trafsum_pfmon_lg$death, na.rm = T)
sum(is.na(trafsum_pfmon_lg$death)) / dim(trafsum_pfmon_lg)[1]

summary(othrsum_pfmon_lg$death)
sd(othrsum_pfmon_lg$death, na.rm = T)
sum(is.na(othrsum_pfmon_lg$death)) / dim(othrsum_pfmon_lg)[1]



